df <- readRDS("../data/models/social-risk-crash-rate-data.rds")
YEARI will treat year as a factor and one-hot encode it.
df$year <- as.factor(df$year)
year_dummies <- model.matrix(~ year - 1, data = df)
df <- cbind(df[ , !(names(df) %in% c("year"))], year_dummies)
We will remove all possible target variables and keep only one per model training.
# Choose your target variable (e.g., crash rate per 1,000 residents)
target_var <- "crash_rate_per_1000"
# Remove all target variables except selected
cols_to_remove <- grep("per_1000",
names(df),
value = TRUE)
cols_to_remove <- setdiff(cols_to_remove, target_var) # keep this column
df <- df %>% select(-all_of(cols_to_remove),)
# Create feature matrix and target vector
X <- df %>% select(-target_var, -borough, -total_population, -geoid)
y <- df[[target_var]]
glimpse(X)
## Rows: 13,518
## Columns: 44
## $ pct_male_population <dbl> 12.460865, 11.694881, 12.229106, 13.5…
## $ pct_white_population <dbl> 4.1225202, 3.6815086, 2.5856754, 2.36…
## $ pct_black_population <dbl> 2.435429, 2.630197, 2.833673, 2.41262…
## $ pct_asian_population <dbl> 0.19803330, 0.14388387, 0.23376782, 0…
## $ pct_hispanic_population <dbl> 6.411328, 6.607147, 5.982424, 6.07935…
## $ pct_foreign_born <dbl> 2.909566, 2.442189, 2.187254, 2.20042…
## $ pct_age_under_18 <dbl> 2.130762, 2.643626, 2.333613, 2.38777…
## $ pct_age_18_34 <dbl> 1.715654, 1.653705, 1.882339, 1.96336…
## $ pct_age_35_64 <dbl> 3.296112, 3.079115, 3.041014, 3.04350…
## $ pct_age_65_plus <dbl> 1.80895804, 1.78607845, 1.56929356, 1…
## $ median_income <dbl> 58582.658, 49964.513, 68000.000, 7086…
## $ pct_income_under_25k <dbl> 0.5445916, 0.5544325, 0.5325841, 0.51…
## $ pct_income_25k_75k <dbl> 1.0568123, 1.1376418, 0.9533662, 0.87…
## $ pct_below_poverty <dbl> 1.9574830, 1.9510653, 1.8091597, 1.93…
## $ median_gross_rent <dbl> 1579.1133, 1524.3577, 1701.0000, 1740…
## $ pct_owner_occupied <dbl> 1.33318303, 1.35699756, 1.58439699, 1…
## $ pct_renter_occupied <dbl> 1.2085934, 1.2305140, 1.1518859, 1.20…
## $ pct_less_than_hs <dbl> 1.4509748, 1.1951954, 1.1302166, 1.04…
## $ pct_hs_diploma <dbl> 1.5576081, 1.4196542, 1.2704773, 1.38…
## $ pct_some_college <dbl> 0.8473540, 0.8997538, 0.7256966, 0.93…
## $ pct_associates_degree <dbl> 0.4912749, 0.3280552, 0.3923234, 0.32…
## $ pct_bachelors_degree <dbl> 0.9482748, 0.9457966, 0.8537607, 0.90…
## $ pct_graduate_degree <dbl> 0.3998749, 0.7098271, 1.1037907, 0.96…
## $ pct_in_labor_force <dbl> 3.566504, 3.430191, 3.555304, 3.59599…
## $ pct_not_in_labor_force <dbl> 3.448445, 3.326595, 3.162980, 3.10658…
## $ unemployment_rate <dbl> 15.750133, 13.478747, 10.806175, 11.5…
## $ pct_commute_short <dbl> 0.7350082, 0.4604284, 0.1585556, 0.41…
## $ pct_commute_medium <dbl> 1.7061331, 1.6882374, 1.2521824, 1.25…
## $ pct_commute_long <dbl> 2.477320, 2.685832, 3.553271, 3.73746…
## $ pct_carpool <dbl> 0.35798327, 0.31462606, 0.00000000, 0…
## $ pct_public_transit <dbl> 2.056500, 2.540030, 2.898721, 2.36674…
## $ pct_walk <dbl> 0.37702494, 0.30695226, 0.13009688, 0…
## $ pct_bike <dbl> 0.00000000, 0.00000000, 0.00000000, 0…
## $ pct_work_from_home <dbl> 0.01713750, 0.04604284, 0.04675356, 0…
## $ pct_vehicle <dbl> 2.0165122, 1.9222885, 2.1120415, 2.03…
## $ post_pandemic <int> 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0…
## $ poverty_vehicle_interaction <dbl> 3.9472883, 3.7505104, 3.8210202, 3.94…
## $ unemployment_vehicle_interaction <dbl> 31.760336, 25.910041, 22.823090, 23.4…
## $ year2018 <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1…
## $ year2019 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ year2020 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ year2021 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ year2022 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0…
## $ year2023 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0…
What This Does - Optuna (Python) handles the search
space and Bayesian optimization. - The final best parameters are applied
to fit the final_model. - Search space:
Instead of predefined grids, trial$suggest_float() and
trial$suggest_int() explore a range of values. -
Best parameters: study$best_params holds
the optimal hyperparameters.
## CONVERT TO DMATRIX
dtrain_all <- xgb.DMatrix(data = as.matrix(X), label = y)
## Start Python venv
reticulate::use_virtualenv("r-reticulate", required = TRUE)
## OPTUNA-BASED SPATIAL CV
optuna <- import("optuna")
boroughs <- unique(df$borough)
folds <- lapply(boroughs, function(b) which(df$borough != b))
# Optuna objective
objective <- function(trial) {
params <- list(
booster = "gbtree",
eta = trial$suggest_float("eta", 0.01, 0.3, log = TRUE),
max_depth = trial$suggest_int("max_depth", 3, 12),
min_child_weight = trial$suggest_int("min_child_weight", 1, 10),
subsample = trial$suggest_float("subsample", 0.5, 1.0),
colsample_bytree = trial$suggest_float("colsample_bytree", 0.5, 1.0),
gamma = trial$suggest_float("gamma", 0, 10),
lambda = trial$suggest_float("lambda", 0, 10),
alpha = trial$suggest_float("alpha", 0, 10)
)
rmse_scores <- numeric(length(folds))
for (i in seq_along(folds)) {
train_idx <- folds[[i]]
valid_idx <- setdiff(seq_len(nrow(dtrain_all)), train_idx)
dtrain <- xgb.DMatrix(data = as.matrix(X[train_idx, ]), label = y[train_idx])
dvalid <- xgb.DMatrix(data = as.matrix(X[valid_idx, ]), label = y[valid_idx])
model <- xgb.train(
params = params,
data = dtrain,
nrounds = 500,
watchlist = list(val = dvalid),
early_stopping_rounds = 20,
verbose = 0
)
rmse_scores[i] <- min(model$evaluation_log$val_rmse)
}
preds <- predict(model, as.matrix(X[valid_idx, ]))
return(Metrics::rmse(y[valid_idx], preds))
}
# Run Optuna study
set.seed(2025)
study <- optuna$create_study(direction = "minimize")
study$optimize(objective, n_trials = 50)
best_params <- study$best_params
print(best_params)
$eta [1] 0.2309768
$max_depth [1] 4
$min_child_weight [1] 3
$subsample [1] 0.5583975
$colsample_bytree [1] 0.8797697
$gamma [1] 3.35839
$lambda [1] 6.131609
$alpha [1] 2.805485
# Set seed
set.seed(2025)
# Split by index
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
y_train <- y[train_index]
X_test <- X[-train_index, ]
y_test <- y[-train_index]
# Convert to xgb.DMatrix
dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
dtest <- xgb.DMatrix(data = as.matrix(X_test), label = y_test)
# Set seed
set.seed(2025)
# Training with parallel processing
final_model <- xgb.train(
params = list(
eta = best_params$eta,
max_depth = best_params$max_depth,
gamma = best_params$gamma,
colsample_bytree = best_params$colsample_bytree,
min_child_weight = best_params$min_child_weight,
subsample = best_params$subsample,
objective = "reg:squarederror",
eval_metric = "rmse"
),
data = dtrain,
nrounds = 1000,
watchlist = list(train = dtrain, test = dtest),
early_stopping_rounds = 20,
verbose = 1,
nthread = detectCores() - 1
)
[78] train-rmse:1.220267 test-rmse:1.555561 [79] train-rmse:1.211549 test-rmse:1.556347 [80] train-rmse:1.207152 test-rmse:1.553078 [81] train-rmse:1.205943 test-rmse:1.553145 [82] train-rmse:1.205311 test-rmse:1.552420 [83] train-rmse:1.201741 test-rmse:1.555835 [84] train-rmse:1.197931 test-rmse:1.553410 [85] train-rmse:1.191889 test-rmse:1.551760 [86] train-rmse:1.189975 test-rmse:1.551783 [87] train-rmse:1.188499 test-rmse:1.550743 Stopping. Best iteration: [67] train-rmse:1.258358 test-rmse:1.549545 ## SAVE MODEL
# Create directory if it doesn't exist
if (!dir.exists("../data/models")) {
dir.create("../data/models", recursive = TRUE)
}
# Save the final XGBoost model
saveRDS(final_model, file = "../data/models/xgb_model.rds")
# Save the best parameters
saveRDS(best_params, file = "../data/models/xgb_best_params.rds")
cat("Model and parameters saved to ../data/models/")
# Load the final XGBoost model
final_model <- readRDS("../data/models/xgb_model.rds")
# Load the best parameters
best_params <- readRDS("../data/models/xgb_best_params.rds")
library(Metrics)
library(ggplot2)
library(dplyr)
set.seed(2025)
# Predict on test set
preds <- predict(final_model, as.matrix(X_test))
# --- Metrics ---
rmse <- sqrt(mean((y_test - preds)^2))
mae <- mean(abs(y_test - preds))
mape <- mean(abs((y_test - preds) / y_test)) * 100
r2 <- 1 - (sum((y_test - preds)^2) / sum((y_test - mean(y_test))^2))
cat("Model Evaluation Metrics:\n")
## Model Evaluation Metrics:
cat(" RMSE:", rmse, "\n")
## RMSE: 1.549545
cat(" MAE :", mae, "\n")
## MAE : 0.8372956
cat(" MAPE:", mape, "%\n")
## MAPE: Inf %
cat(" R² :", r2, "\n\n")
## R² : 0.2595503
# --- Residuals ---
residuals <- y_test - preds
residual_df <- data.frame(
actual = y_test,
predicted = preds,
residuals = residuals
)
# --- Plot: Predicted vs Actual ---
p1 <- residual_df %>%
ggplot(aes(x = actual, y = predicted)) +
geom_point(alpha = 0.5) +
geom_abline(slope = 1, intercept = 0, color = "red") +
theme_minimal() +
labs(title = "Predicted vs Actual Crash Rates",
x = "Actual",
y = "Predicted")
# --- Plot: Residuals vs Predicted ---
p2 <- residual_df %>%
ggplot(aes(x = predicted, y = residuals)) +
geom_point(alpha = 0.5, color = "blue") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Residuals vs Predicted",
x = "Predicted",
y = "Residuals")
# --- Plot: Residual Density ---
# Residual Histogram
p3 <- ggplot(residual_df, aes(x = residuals)) +
geom_histogram(binwidth = 0.2, fill = "steelblue", color = "white") +
geom_density(color = "red") +
theme_minimal() +
labs(title = "Residual Distribution", x = "Residuals", y = "Count")
# Print plots
print(p1)
print(p2)
print(p3)
ggsave("../report/plots/predicted_vs_actual_values_plot.png", p1, width = 10, height = 6, dpi = 300)
ggsave("../report/plots/resisuals_vs_predicted_values_plot.png", p2, width = 10, height = 6, dpi = 300)
ggsave("../report/plots/residual_density_plot.png", p3, width = 10, height = 6, dpi = 300)
Model Evaluation Metrics: RMSE: 1.549545 MAE : 0.8372956 MAPE: Inf % R² : 0.2595503 ## SHAP EXPLAINABILITY
# Compute SHAP values
shap_values <- shap.values(xgb_model = final_model, X_train = as.matrix(X_train))
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = as.matrix(X_train))
# SHAP summary plot
print(shap.plot.summary(shap_long))
if (!dir.exists("../report/plots")) {
dir.create("../report/plots")
}
shap <- shap.plot.summary(shap_long)
ggsave("../report/plots/shap_summary_plot.png", shap, width = 10, height = 6, dpi = 300)
xgb.plot.tree(model = final_model, trees = 0)
xgb.plot.tree(model = final_model, trees = 1)
xgb.plot.tree(model = final_model, trees = 2)
xgb.plot.multi.trees(model = final_model)
# ============================================================
# Additional Model Diagnostics and Deeper Analysis
# ============================================================
library(ggplot2)
library(dplyr)
library(pdp) # For Partial Dependence Plots
library(DALEX) # For model explainability
library(ggthemes)
library(sf)
# ---------------------------
# 1. SHAP Dependence and Interaction Plots
# ---------------------------
message("\nGenerating SHAP dependence and interaction plots...")
# Assuming shap_values and shap_long are already computed
# (if not, recompute them using iml or SHAPforxgboost packages)
# Top feature by SHAP importance
top_feature <- shap_long %>%
as_tibble() %>%
count(variable, wt = abs(value), sort = TRUE) %>%
dplyr::slice(1) %>%
pull(variable)
# Dependence plot for top feature
shap.plot.dependence(data_long = shap_long, x = top_feature, color_feature = top_feature)
# Interaction values
shap_interaction_values <- predict(
final_model,
as.matrix(X_train),
predinteraction = TRUE
)
# shap_interaction_values will be a 3D array: [n_samples, n_features, n_features]
interactions <- dim(shap_interaction_values)
saveRDS(interactions, "../data/models/shap_interactions.rds")
# ---------------------------
# 3. Partial Dependence Plots (PDP)
# ---------------------------
message("\nGenerating Partial Dependence Plots...")
top_features <- shap_long %>%
count(variable, wt = abs(value), sort = TRUE) %>%
dplyr::slice(1:10) %>%
pull(variable)
# Ensure the output directory exists
dir.create("../report/plots/pdp", recursive = TRUE, showWarnings = FALSE)
for (f in top_features) {
pd <- partial(final_model, pred.var = f, train = as.matrix(X_train), grid.resolution = 30)
p <- plot(pd, main = paste("Partial Dependence of", f))
# Save as PNG
ggsave(
filename = paste0("../report/plots/pdp/pdp_", f, ".png"),
plot = p,
width = 8,
height = 6,
dpi = 300
)
}